Exploring the distance matrix between samples under various conditions
Required for developing the double-KNN correction
#collapse-hide
dosagefile = '/cbscratch/sbanerj/gtex_pca/gtex_v8_filtered.dosage.raw'
dosage_numpy_file = '/cbscratch/sbanerj/gtex_pca/gtex_dosage.npy'
expression_file = '/scratch/sbanerj/trans-eqtl/input/gtex_v8/expression/gtex_ms_raw_std_protein_coding_lncRNA.txt'
#collapse-hide
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from scipy import stats
import os
from scipy.cluster import hierarchy as hc
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.axes_grid1 import make_axes_locatable
from utils import mpl_stylesheet
mpl_stylesheet.banskt_presentation(fontfamily = 'latex-clearsans', fontsize = 18, colors = 'banskt', dpi = 300)
#collapse-hide
def read_gtex(filename): # returns N x G gene expression
expr_list = list()
donor_list = list()
gene_list = list()
with open(filename) as mfile:
donor_list = mfile.readline().strip().split("\t")[1:]
for line in mfile:
linesplit = line.strip().split("\t")
gene = linesplit[0].strip()
gene_list.append(gene)
expr = np.array([float(x) for x in linesplit[1:]])
expr_list.append(expr)
expr = np.transpose(np.array(expr_list))
return expr, donor_list, gene_list
if not os.path.isfile(dosage_numpy_file):
dosage = np.loadtxt(dosagefile, delimiter=' ', skiprows=1, usecols=range(6, 97612))
np.save(dosage_numpy_file, dosage)
else:
dosage = np.load(dosage_numpy_file)
gtsamples = list()
with open (dosagefile, 'r') as infile:
next(infile)
for line in infile:
gtsamples.append(line.strip().split()[1])
gx, gxsamples, _ = read_gtex(expression_file)
sampleidx = [gtsamples.index(x) for x in gxsamples] # assumes all expression samples have genotype
dreduce = dosage[sampleidx, :]
gt = dreduce - np.mean(dreduce, axis = 0).reshape(1, -1)
print(f'{len(sampleidx)} samples, {gx.shape[1]} genes, {gt.shape[1]} SNPs.')
print('Centered the genotype. Samples are in the same order as `gxsamples`.')
#collapse-hide
def get_pca(x, K):
pca = PCA(n_components=K)
pca.fit(x) # requires N x P (n_samples, n_features)
x_pca = pca.transform(x)
return x_pca
def get_distance(a, b):
return np.linalg.norm(a - b)
def distance_matrix(x_pca):
nsample = x_pca.shape[0]
distance_matrix = np.zeros((nsample, nsample))
for i in range(nsample):
for j in range(i+1, nsample):
dist = get_distance(x_pca[i,:], x_pca[j,:])
distance_matrix[i, j] = dist
distance_matrix[j, i] = dist
return distance_matrix
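# Note: an equivalent and much faster way to build the same Euclidean distance
# matrix, assuming scipy.spatial is acceptable here, is
#     from scipy.spatial.distance import pdist, squareform
#     dm = squareform(pdist(x_pca))
# the explicit double loop above is kept for readability.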
def map_distance_matrix(dm, samples, target_samples):
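    '''
    Reorder the distance matrix dm, whose rows/columns follow `samples`,
    to follow `target_samples`; entries for target samples that are not
    present in `samples` are left as NaN.
    '''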
N = len(target_samples)
newdm = np.zeros((N, N))
newdm[:] = np.nan
for i in range(N):
if target_samples[i] in samples:
newdm[i, i] = 0 # diagonal is always zero
iold = samples.index(target_samples[i])
for j in range(i+1, N):
if target_samples[j] in samples:
jold = samples.index(target_samples[j])
newdm[i, j] = dm[iold, jold]
newdm[j, i] = dm[jold, iold]
return newdm
def knn(gx, gt, dm, K):
    '''
    KNN correction: for each sample, subtract the mean of its K nearest
    neighbors (found from the distance matrix dm) from both the expression
    (gx) and the genotype (gt). Both gx and gt are sample-major (samples in rows).
    '''
    assert (gx.shape[0] == gt.shape[0])
    N = gx.shape[0]
    gx_knn = np.zeros_like(gx)
    gt_knn = np.zeros_like(gt)
    for i in range(N):
        # K nearest neighbors of sample i, dropping the nearest entry
        # (the sample itself, whose self-distance is zero)
        neighbors = np.argsort(dm[i, :])[:K + 1][1:]
        gx_knn[i, :] = gx[i, :] - np.mean(gx[neighbors, :], axis = 0)
        # correct the genotype along the sample axis (rows), using the same neighbors
        gt_knn[i, :] = gt[i, :] - np.mean(gt[neighbors, :], axis = 0)
    return gx_knn, gt_knn
def remove_nfirst_pcs(X, n=1):
mu = np.mean(X, axis = 0)
Xnorm = X - mu
U, S, Vt = np.linalg.svd(Xnorm, full_matrices=False)
Xhat = U[:, n:] @ np.diag(S[n:]) @ Vt[n:, :]
Xhat += mu
return Xhat
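As a quick sanity check of the helpers above (a minimal sketch on random data; the shapes and K are arbitrary), the KNN correction should return arrays with the same shapes as its inputs:
#collapse-show
# Sanity check on synthetic data (arbitrary shapes and K);
# uses get_pca, distance_matrix and knn defined above.
rng = np.random.default_rng(42)
toy_gx = rng.standard_normal((50, 200))   # 50 samples x 200 "genes"
toy_gt = rng.standard_normal((50, 500))   # 50 samples x 500 "SNPs"
toy_dm = distance_matrix(get_pca(toy_gx, 10))
toy_gx_knn, toy_gt_knn = knn(toy_gx, toy_gt, toy_dm, K = 5)
assert toy_gx_knn.shape == toy_gx.shape and toy_gt_knn.shape == toy_gt.shape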
#collapse-hide
def plot_distance_matrices(dmA, dmB, norms = None):
'''
provide norms, if required, as norms = (norm1, norm2)
where,
norm1 = matplotlib.colors.DivergingNorm(vmin=10., vcenter=90., vmax=170.)
norm2 = matplotlib.colors.DivergingNorm(vmin=0., vcenter=90., vmax=300.)
'''
fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
    # The zero distance of each sample to itself is bad for the color scale,
    # so mask the diagonal. Note that this modifies the arrays passed in,
    # in place (the callers below pass fancy-indexed copies).
dmA[np.diag_indices(dmA.shape[0])] = np.nan
dmB[np.diag_indices(dmB.shape[0])] = np.nan
cmap1 = plt.get_cmap("YlOrRd")
cmap1.set_bad('w')
cmap2 = plt.get_cmap("YlGnBu")
cmap2.set_bad('w')
if norms is not None:
norm1 = norms[0]
norm2 = norms[1]
im1 = ax1.imshow(dmA, cmap = cmap1, norm = norm1, interpolation='nearest')
im2 = ax2.imshow(dmB, cmap = cmap2, norm = norm2, interpolation='nearest')
else:
im1 = ax1.imshow(dmA, cmap = cmap1, interpolation='nearest')
im2 = ax2.imshow(dmB, cmap = cmap2, interpolation='nearest')
divider = make_axes_locatable(ax1)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im1, cax=cax, fraction = 0.1)
divider = make_axes_locatable(ax2)
cax = divider.append_axes("right", size="5%", pad=0.2)
cbar = plt.colorbar(im2, cax=cax, fraction = 0.1)
ax1.set_title("Genotype space", pad = 20)
ax2.set_title("Expression space", pad = 20)
plt.tight_layout()
return fig
#collapse-show
# Before KNN
dm_gt = distance_matrix(get_pca(gt, 20))
dm_gx = distance_matrix(get_pca(gx, 30))
# Expression KNN
K = 30
gx_knn, gt_knn = knn(gx, gt, dm_gx, K)
dm_gt_knn = distance_matrix(get_pca(gt_knn, 40))
dm_gx_knn = distance_matrix(get_pca(gx_knn, gx_knn.shape[0]))
# Double KNN
K1 = 10
K2 = 30
gx_knn1, gt_knn1 = knn(gx, gt, dm_gt, K1)
dm_gx1 = distance_matrix(get_pca(gx_knn1, gx_knn1.shape[0]))
gx_knn2, gt_knn2 = knn(gx_knn1, gt_knn1, dm_gx1, K2)
dm_gt_knn2 = distance_matrix(get_pca(gt_knn2, 40))
dm_gx_knn2 = distance_matrix(get_pca(gx_knn2, gx_knn2.shape[0]))
#collapse-hide
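# Note: hc.linkage interprets the square matrix as one observation vector per
# sample (not as a precomputed distance matrix), so scipy emits a warning here;
# the resulting leaf order is used only to arrange samples for display.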
o1 = hc.leaves_list(hc.linkage(dm_gt, method = 'centroid'))
#collapse-hide
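# Note: matplotlib renamed DivergingNorm to TwoSlopeNorm in version 3.2 and
# later removed the old name; on newer matplotlib use
# matplotlib.colors.TwoSlopeNorm with the same arguments.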
norm1 = matplotlib.colors.DivergingNorm(vmin=10., vcenter=90., vmax=170.)
norm2 = matplotlib.colors.DivergingNorm(vmin=0., vcenter=75., vmax=300.)
norms = (norm1, norm2)
#collapse-show
mgt = dm_gt[o1, :][:, o1]
mgx = dm_gx[o1, :][:, o1]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Distance between samples before KNN")
plt.show()
#collapse-show
mgt = dm_gt_knn[o1, :][:, o1]
mgx = dm_gx_knn[o1, :][:, o1]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Distance between samples after KNN")
plt.show()
#collapse-show
mgt = dm_gt_knn2[o1, :][:, o1]
mgx = dm_gx_knn2[o1, :][:, o1]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Distance between samples after double KNN")
plt.show()
#collapse-hide
o2 = hc.leaves_list(hc.linkage(dm_gx, method='centroid'))
#collapse-show
mgt = dm_gt[o2, :][:, o2]
mgx = dm_gx[o2, :][:, o2]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Distance between samples before KNN")
plt.show()
#collapse-show
mgt = dm_gt_knn[o2, :][:, o2]
mgx = dm_gx_knn[o2, :][:, o2]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Distance between samples after KNN")
plt.show()
#collapse-show
mgt = dm_gt_knn2[o2, :][:, o2]
mgx = dm_gx_knn2[o2, :][:, o2]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Distance between samples after double KNN")
plt.show()
#collapse-show
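# Difference between the distance matrices after double KNN and after single KNN
# (o1 ordering); no fixed color norms, since the values are differences.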
mgt = dm_gt_knn2[o1, :][:, o1] - dm_gt_knn[o1, :][:, o1]
mgx = dm_gx_knn2[o1, :][:, o1] - dm_gx_knn[o1, :][:, o1]
fig = plot_distance_matrices(mgt, mgx)
plt.show()
Calculation with a modified distance matrix
The distance matrix shows strong patterns, which we hypothesized reflect technical noise among the samples; this noise could prevent the KNN step from finding the correct neighbors.
Below, I calculate the distance matrix, remove its first two principal components, and then find the neighbors for the KNN correction from this modified distance matrix.
Note that the variables from the previous section are reassigned below (poor coding, be careful!).
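To illustrate what `remove_nfirst_pcs` does on a matrix dominated by a low-rank pattern (a minimal sketch on synthetic data; the sizes and noise level are arbitrary), removing the first two principal components should leave little more than the noise:
#collapse-show
# Synthetic illustration of remove_nfirst_pcs (arbitrary sizes): a rank-2
# pattern plus small noise should mostly vanish after removing 2 PCs.
rng = np.random.default_rng(0)
pattern = rng.standard_normal((100, 2)) @ rng.standard_normal((2, 100))
toy = pattern + 0.01 * rng.standard_normal((100, 100))
toy_corr = remove_nfirst_pcs(toy, n = 2)
print(np.linalg.norm(toy_corr - np.mean(toy, axis = 0)), np.linalg.norm(toy))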
#collapse-show
# Before KNN
dm_gt = distance_matrix(get_pca(gt, 20))
dm_gx = distance_matrix(get_pca(gx, 30))
dm_gx_corr = remove_nfirst_pcs(dm_gx, n=2)
# Expression KNN
K = 30
gx_knn, gt_knn = knn(gx, gt, dm_gx_corr, K)
dm_gt_knn = distance_matrix(get_pca(gt_knn, 40))
dm_gx_knn = distance_matrix(get_pca(gx_knn, gx_knn.shape[0]))
# Double KNN
K1 = 10
K2 = 30
gx_knn1, gt_knn1 = knn(gx, gt, dm_gt, K1)
dm_gx1 = distance_matrix(get_pca(gx_knn1, gx_knn1.shape[0]))
dm_gx1_corr = remove_nfirst_pcs(dm_gx1, n=2)
gx_knn2, gt_knn2 = knn(gx_knn1, gt_knn1, dm_gx1_corr, K2)
dm_gt_knn2 = distance_matrix(get_pca(gt_knn2, 40))
dm_gx_knn2 = distance_matrix(get_pca(gx_knn2, gx_knn2.shape[0]))
#collapse-hide
o2 = hc.leaves_list(hc.linkage(dm_gx_corr, method = 'centroid'))
#collapse-hide
norm1 = matplotlib.colors.DivergingNorm(vmin=10., vcenter=90., vmax=170.)
norm2 = matplotlib.colors.DivergingNorm(vmin=0., vcenter=80., vmax=300.)
norms = (norm1, norm2)
#collapse-hide
mgt = dm_gt[o2, :][:, o2]
mgx = dm_gx_corr[o2, :][:, o2]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Modified distance matrix before KNN")
plt.show()
#collapse-hide
mgt = dm_gt_knn[o2, :][:, o2]
mgx = dm_gx_knn[o2, :][:, o2]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Original distance matrix after KNN")
plt.show()
mgt = dm_gt_knn2[o2, :][:, o2]
mgx = dm_gx_knn2[o2, :][:, o2]
fig = plot_distance_matrices(mgt, mgx, norms = norms)
fig.suptitle("Original distance matrix after double KNN")
plt.show()